import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
# Load the customer sample into a DataFrame and preview the first rows.
# NOTE: the path is absolute and specific to one Windows machine.
csv_path = r"C:\Users\syed sahel\Downloads\customers_sample.csv"
df = pd.read_csv(csv_path)
df.head()
| | CustomerID | Age | Gender | Annual_Income | Spending_Score | Favorite_Genre |
|---|---|---|---|---|---|---|
| 0 | 1 | 54 | Female | 30260 | 81 | Romance |
| 1 | 2 | 67 | Male | 51855 | 65 | Comedy |
| 2 | 3 | 44 | Male | 56393 | 97 | Sci-Fi |
| 3 | 4 | 30 | Male | 82355 | 100 | Drama |
| 4 | 5 | 58 | Male | 12688 | 70 | Action |
3️⃣ Data Cleaning & Preprocessing
# Count the missing values in every column.
df.isna().sum()
CustomerID        0
Age               0
Gender            0
Annual_Income     0
Spending_Score    0
Favorite_Genre    0
dtype: int64
# Remove rows with missing values and duplicate rows, then one-hot encode the
# categorical columns. drop_first=True omits the first level of each
# categorical column so the indicator columns are not redundant.
deduplicated = df.dropna().drop_duplicates()
df = pd.get_dummies(deduplicated, drop_first=True)

# Inspect the resulting schema and the summary statistics.
df.info()
df.describe()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 500 entries, 0 to 499
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype
---  ------                      --------------  -----
 0   CustomerID                  500 non-null    int64
 1   Age                         500 non-null    int64
 2   Annual_Income               500 non-null    int64
 3   Spending_Score              500 non-null    int64
 4   Gender_Male                 500 non-null    uint8
 5   Gender_Other                500 non-null    uint8
 6   Favorite_Genre_Comedy       500 non-null    uint8
 7   Favorite_Genre_Documentary  500 non-null    uint8
 8   Favorite_Genre_Drama        500 non-null    uint8
 9   Favorite_Genre_Horror       500 non-null    uint8
 10  Favorite_Genre_Romance      500 non-null    uint8
 11  Favorite_Genre_Sci-Fi       500 non-null    uint8
dtypes: int64(4), uint8(8)
memory usage: 23.4 KB
| | CustomerID | Age | Annual_Income | Spending_Score | Gender_Male | Gender_Other | Favorite_Genre_Comedy | Favorite_Genre_Documentary | Favorite_Genre_Drama | Favorite_Genre_Horror | Favorite_Genre_Romance | Favorite_Genre_Sci-Fi |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 500.000000 | 500.000000 | 500.000000 | 500.000000 | 500.0000 | 500.000000 | 500.000000 | 500.000000 | 500.000000 | 500.000000 | 500.000000 | 500.000000 |
| mean | 250.500000 | 43.312000 | 57371.108000 | 90.838000 | 0.4800 | 0.026000 | 0.206000 | 0.066000 | 0.182000 | 0.116000 | 0.106000 | 0.144000 |
| std | 144.481833 | 15.577161 | 24048.352901 | 12.699382 | 0.5001 | 0.159295 | 0.404836 | 0.248531 | 0.386231 | 0.320546 | 0.308146 | 0.351441 |
| min | 1.000000 | 16.000000 | 8000.000000 | 37.000000 | 0.0000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 125.750000 | 30.750000 | 41230.000000 | 84.000000 | 0.0000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 50% | 250.500000 | 44.000000 | 56877.000000 | 98.000000 | 0.0000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 75% | 375.250000 | 56.000000 | 72737.250000 | 100.000000 | 1.0000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| max | 500.000000 | 69.000000 | 120810.000000 | 100.000000 | 1.0000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
# Pairwise scatter plots / distributions for every column of the frame.
sns.pairplot(data=df)
plt.show()

# Correlation heatmap of all columns, with the coefficient printed per cell.
correlations = df.corr()
heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 5))
sns.heatmap(correlations, annot=True, cmap="coolwarm", ax=heatmap_ax)
plt.show()
✔ Histograms
# One histogram per column, drawn on a shared 12x10 inch figure.
hist_figsize = (12, 10)
df.hist(figsize=hist_figsize)
plt.show()
5️⃣ Feature Engineering
# Columns used as clustering inputs.
features = ["Age", "Annual_Income", "Spending_Score"]

# Feature matrix: all rows, only the selected columns.
X = df.loc[:, features]
✔ Standardize the data
# Standardize the features so that each column contributes on the same scale
# to the distance computations used by K-Means.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
6️⃣ Apply Clustering (K-Means)
# Elbow method: fit K-Means for k = 1..10 and record the within-cluster sum of
# squares (inertia) so that the bend in the curve can guide the choice of k.
k_values = range(1, 11)
wcss = []
for i in k_values:
    # A fixed random_state (the same seed the final model uses) makes the
    # curve reproducible from run to run. n_init is given explicitly because
    # its default differs between scikit-learn versions.
    kmeans = KMeans(n_clusters=i, random_state=42, n_init=10)
    kmeans.fit(X_scaled)
    wcss.append(kmeans.inertia_)

plt.plot(k_values, wcss, marker='o')
plt.title("Elbow Method")
plt.xlabel("Number of Clusters")
plt.ylabel("WCSS")
plt.show()
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\cluster\_kmeans.py:1036: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=2. warnings.warn(
# The scikit-learn warning asks for the OMP_NUM_THREADS *environment variable*;
# a plain Python assignment only binds a local name and has no effect on it.
# NOTE(review): threading runtimes usually read this variable when they are
# first loaded, so for it to take effect this likely has to run before
# numpy / scikit-learn are imported (e.g. restart the kernel and run it
# first) - confirm on the target machine.
import os

OMP_NUM_THREADS = 2
os.environ["OMP_NUM_THREADS"] = str(OMP_NUM_THREADS)
📌 Train Final K-Means Model (k=4 Example)
# Fit the final 4-cluster model (seeded for reproducibility) on the scaled
# features and store each row's cluster label next to its data.
kmeans = KMeans(n_clusters=4, random_state=42)
kmeans.fit(X_scaled)
df["Cluster"] = kmeans.labels_
df.head()
| | CustomerID | Age | Annual_Income | Spending_Score | Gender_Male | Gender_Other | Favorite_Genre_Comedy | Favorite_Genre_Documentary | Favorite_Genre_Drama | Favorite_Genre_Horror | Favorite_Genre_Romance | Favorite_Genre_Sci-Fi | Cluster |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 54 | 30260 | 81 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| 1 | 2 | 67 | 51855 | 65 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | 3 | 44 | 56393 | 97 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 3 |
| 3 | 4 | 30 | 82355 | 100 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 |
| 4 | 5 | 58 | 12688 | 70 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
7️⃣ Visualize Clusters
✔ 2D Scatter Plot
# Income vs. spending score, one marker per customer, coloured by cluster.
plt.figure(figsize=(8, 6))
scatter_ax = sns.scatterplot(
    data=df,
    x="Annual_Income",
    y="Spending_Score",
    hue="Cluster",
    palette="bright",
    s=100,
)
scatter_ax.set_title("Customer Segmentation")
plt.show()
✔ 3D Visualization
# Imported for its side effect: older Matplotlib releases need it to make the
# '3d' projection available.
from mpl_toolkits.mplot3d import Axes3D

# Age / income / spending score in 3D, coloured by cluster label.
fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot(111, projection='3d')

age = df["Age"]
income = df["Annual_Income"]
spending = df["Spending_Score"]
ax.scatter(age, income, spending, c=df["Cluster"], s=50)

ax.set_xlabel("Age")
ax.set_ylabel("Annual Income")
ax.set_zlabel("Spending Score")
ax.set_title("3D Customer Segmentation")
plt.show()
8️⃣ PCA Visualization (Dimensionality Reduction)
# Project the scaled features onto their first two principal components so the
# clusters can be viewed in a single 2D plot.
pca = PCA(n_components=2)
pca_data = pca.fit_transform(X_scaled)

# Rows of the transposed result are the two component score vectors.
df['PC1'], df['PC2'] = pca_data.T

sns.scatterplot(data=df, x='PC1', y='PC2', hue='Cluster', palette='tab10', s=100)
plt.title("PCA Cluster Visualization")
plt.show()
9️⃣ Cluster Summary (Insights)
# Mean age, income and spending score per cluster, used to describe segments.
summary_columns = ["Age", "Annual_Income", "Spending_Score"]
per_cluster = df.groupby("Cluster")
cluster_summary = per_cluster[summary_columns].mean()
cluster_summary
| Cluster | Age | Annual_Income | Spending_Score |
|---|---|---|---|
| 0 | 58.085714 | 39166.657143 | 70.485714 |
| 1 | 30.379845 | 79309.782946 | 99.705426 |
| 2 | 30.500000 | 37240.032258 | 96.233871 |
| 3 | 55.323944 | 68481.190141 | 93.119718 |
🔟 Dashboard (Plotly)
import plotly.express as px

# Cluster ids are labels, not quantities. Passing the integer column as
# `color` makes Plotly Express draw a continuous colour bar, so the labels are
# converted to strings on a copy (df itself is left untouched) to get one
# discrete colour and legend entry per cluster.
dashboard_df = df.assign(Cluster=df["Cluster"].astype(str))

# Interactive income vs. spending scatter; marker size encodes age.
fig = px.scatter(
    dashboard_df,
    x="Annual_Income",
    y="Spending_Score",
    color="Cluster",
    size="Age",
    hover_data=["Age"],
    # Keep the legend in numeric cluster order rather than order of appearance.
    category_orders={"Cluster": sorted(dashboard_df["Cluster"].unique(), key=int)},
    title="Customer Segmentation Dashboard"
)
fig.show()